#Packages
#1. Load data
#2. Descriptive statistics
##2.1. Number of factor levels of variables
length(unique(db$ID))
## [1] 15253
length(unique(db$Gender))
## [1] 2
length(unique(db$Inst_HUN))
## [1] 52
length(unique(db$Pr_area_ENG))
## [1] 32
length(unique(db$Pr_HUN))
## [1] 398
length(unique(db$Pr_ENG_bachelor))
## [1] 111
length(unique(db$Grad_level))
## [1] 7
length(unique(db$FEOR4))
## [1] 402
length(unique(db$FEOR3))
## [1] 109
length(unique(db$ISCO1))
## [1] 9
length(unique(db$ISCO3))
## [1] 115
length(unique(db$Req_HEd))
## [1] 2
length(unique(db$Head_couty_HUN))
## [1] 0
length(unique(db$Weekly_hrs))
## [1] 49
length(unique(db$Mountly_wage_HUF))
## [1] 7769
##2.2. Number of graduates in different groups
## [1] 15253
##
## A B E F M O P
## 921 7402 3252 1889 1334 46 409
## [1] 45
## [1] 15
## [1] 110
## [1] 372
## [1] 106
##2.3 Distribution of graduates working in occupation category that requires higher education degree
##
## 0 1
## A 0.61129207 0.38870793
## B 0.31234801 0.68765199
## E 0.15221402 0.84778598
## F 0.33086289 0.66913711
## M 0.09520240 0.90479760
## O 0.08695652 0.91304348
## P 0.32518337 0.67481663
0: occupations that does NOT require higher education 1: occupations that does require higher education
A: higher vocational trainings B: bachelor E: univerity (before Bologna) F: collage (before Bologna) M: master O: undivided training (e.g loyar, doctor) P: special teacher training
##2.4. Distribution of bachelor graduates working in an occupation that requires higher education degree (Fig1)
##
## 0 1
## Agricultural Science 0.3925620 0.6074380
## Art Education 0.3200000 0.6800000
## Arts 0.2291667 0.7708333
## Arts and Humanities 0.4260204 0.5739796
## Computer Science and Information Technology 0.1082474 0.8917526
## Economic Science 0.3412810 0.6587190
## Engineering Science 0.1844106 0.8155894
## Health Science 0.1239193 0.8760807
## Law 0.3333333 0.6666667
## Military sciences 0.2058824 0.7941176
## Natural sciences 0.4038462 0.5961538
## Religious Science 0.2500000 0.7500000
## Social Science 0.3614035 0.6385965
## Sport Science 0.4666667 0.5333333
## Teacher Training 0.3250774 0.6749226
##2.5. Distribution of graduates that work on occupation which requiring higher education degree by counties in Hungary (Fig2)
##
## 0 1
## HU101 829 1748
## HU102 165 287
## HU211 50 108
## HU212 55 64
## HU213 52 81
## HU221 95 169
## HU222 34 68
## HU223 29 57
## HU231 29 50
## HU232 21 44
## HU233 16 39
## HU311 73 154
## HU312 35 56
## HU313 15 23
## HU321 98 202
## HU322 39 110
## HU323 72 148
## HU331 49 113
## HU332 65 135
## HU333 86 175
## not NUTS important taxpayer 379 1206
## not NUTS special tax 26 53
##2.6. Gender pay gap (Fig.3, Fig.4)
Lollipop chart on wage differences by gender
## Using Mountly_wage_HUF as value column: use value.var to override.
## Using Mountly_wage_HUF as value column: use value.var to override.
##2.7. Strongest connections (Table4)
## Var1
## 1 Business management
## 2 Mechanical Engineer
## 3 IT engineer
## 4 Tourism
## 5 Electric engineer
## 6 Finance and Accounting
## 7 Nursing and Patient Care
## 8 Nursing and Patient Care
## 9 Software Engineering
## 10 Finance and Accounting
## Occupation value exp diff
## 1 Financial and mathematical associate professionals 153 49 104
## 2 Engineering professionals (excluding electrotechnology) 114 13 101
## 3 Software and applications developers and analysts 102 7 95
## 4 Client information workers 83 16 67
## 5 Electrotechnology engineers 65 3 62
## 6 Financial and mathematical associate professionals 80 18 62
## 7 Other health professionals 55 2 53
## 8 Nurses and midwives 47 2 45
## 9 Software and applications developers and analysts 48 4 44
## 10 Numerical and material recording clerks 49 7 42
##2.8. Weakest connections (Table5)
## Var1
## 1 Business management
## 2 Communication and Media Science
## 3 Business management
## 4 Tourism
## 5 Electric engineer
## 6 Business management
## 7 Andragogy
## 8 Electric engineer
## 9 Finance and Accounting
## 10 Commerce and marketing
## Occupation value exp diff
## 1 Engineering professionals (excluding electrotechnology) 21 48 -27
## 2 Engineering professionals (excluding electrotechnology) 6 26 -20
## 3 Software and applications developers and analysts 9 28 -19
## 4 Engineering professionals (excluding electrotechnology) 1 20 -19
## 5 Business services agents 3 21 -18
## 6 Physical and engineering science technicians 9 26 -17
## 7 Engineering professionals (excluding electrotechnology) 4 20 -16
## 8 Financial and mathematical associate professionals 1 15 -14
## 9 Engineering professionals (excluding electrotechnology) 3 17 -14
## 10 Engineering professionals (excluding electrotechnology) 1 15 -14
#3. Analysis of degree distributions
##3.1. “Program set” of bipartite graph (Fig. 5)
The initial distribution of data can be seen on figure below.
###Initial estimation
This estimation calcualate \(K_{min}\), \(\gamma\) and makes CDF and determine minimum \(D\). With {poweRlaw} functions very easy to achive suggested steps 1-3.
## [1] 26
## [1] 1.930942
## [1] 0.09800509
###Scanning whole range
And the fitted power-law on CDF curve.
###Investigate goodness of fit
Initial estimated \(\gamma\) and \(K_{min}\) denoted with blue colour on the figure.
## [1] 0.074
The \(p\) value is 0.074. If \(p\) value more than 1% means that null hypothesis cannot be rejected maybe it is a power-law distribution.
###Fitting real distribution
This figure compares different distributions. (red: power-law, green: lognormal, blue: Poisson, magenta: exponential)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
Compare lognormal and powerlaw distribution with Vuong’s test
## [1] 0.6281597
##3.2. “Occupation set” of bigraph (Fig. 6)
The initial distribution of data can be seen on figure below.
It is very similar to power-law distribution but let it see with {powerLaw}.
###Initial estimation
This estimation calcualate \(K_{min}\), \(\gamma\) and makes CDF and determine minimum \(D\). With {poweRlaw} functions very easy to achive suggested steps 1-3.
## [1] 63
## [1] 2.177573
## [1] 0.1159359
###Scanning whole range
And the fitted power-law on CDF curve.
###Investigate goodness of fit
Initial estimated \(\gamma\) and \(K_{min}\) denoted with blue colour on the figure.
## [1] 0.1
The \(p\) value is 0.1. If \(p\) value more than 1% means that null hypothesis cannot be rejected maybe it is a power-law distribution.
###Fitting real distribution
This figure compares different distributions. (red: power-law, green: lognormal, blue: Poisson, magenta: exponential)
Compare lognormal and powerlaw distribution with Vuong’s test
## [1] 0.2749639
#4. Node measures
##4.1. Centralities
## Warning: `evcent()` was deprecated in igraph 2.0.0.
## ℹ Please use `eigen_centrality()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `list.vertex.attributes()` was deprecated in igraph 2.0.0.
## ℹ Please use `vertex_attr_names()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `get.adjlist()` was deprecated in igraph 2.0.0.
## ℹ Please use `as_adj_list()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `bipartite.projection()` was deprecated in igraph 2.0.0.
## ℹ Please use `bipartite_projection()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `get.edgelist()` was deprecated in igraph 2.0.0.
## ℹ Please use `as_edgelist()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: `page.rank()` was deprecated in igraph 2.0.0.
## ℹ Please use `page_rank()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## nodes occ degree.weighted degree betweenness.w
## 1 Nursing and Patient Care 0 192 29 319.692
## 2 Andragogy 0 367 65 1951.548
## 3 Business management 0 874 71 3714.669
## 4 Electric engineer 0 258 31 812.092
## 5 Tourism 0 364 55 1079.122
## 6 Communication and Media Science 0 469 55 1705.610
## betweenness eigen.centr.w eigen.centr power.centr.w power.centr
## 1 205.2932 0.03406125 0.4782548 -0.3633216 -0.47515436
## 2 1270.4400 0.30107024 0.8684517 -0.3696028 -0.73715677
## 3 1480.1857 1.00000000 0.9251576 -0.5914092 -0.45960988
## 4 580.4062 0.09377294 0.4275998 -0.4443834 0.45832985
## 5 815.1195 0.30629223 0.7842568 -0.6436639 -0.07243149
## 6 876.7196 0.37082755 0.7415642 -0.5505395 0.42883356
## transitivity.w transitivity closeness.w closeness pagerank.w pagerank
## 1 2.816069 0.2221909 0.001926782 0.001926782 0.01111493 0.01111493
## 2 10.818187 0.2151061 0.002331002 0.002331002 0.02176081 0.02176081
## 3 23.980222 0.2159743 0.002386635 0.002386635 0.04571265 0.04571265
## 4 8.903414 0.1923984 0.001956947 0.001956947 0.01578785 0.01578785
## 5 11.200653 0.2187802 0.002227171 0.002227171 0.02070522 0.02070522
## 6 13.757914 0.2032963 0.002207506 0.002207506 0.02662744 0.02662744
## knn.w
## 1 101.9426
## 2 102.4841
## 3 102.2990
## 4 100.2730
## 5 102.8040
## 6 102.7469
##4.2. Distance distribution
## [1] 2.552741
fitted equation: function(d) \(k*exp(-1/2*(d-mu)^2/sigma^2)\)
k = 0.1904586 mu = 2.5388282 sigma = 0.8250279
The variance of distances is sigma=0.8250279, indicating that most path legths are in a close vicinity of mean d. These are manifestations of the small world property. [Barabasi: Network science, 2015]
##4.3. Farthest nodes
## [1] "831 - Architecture "
## [1] "752 - Architecture "
#5. Clustering methods
##5.1. Create purified graph Purified with configuration model (alpha=0.5)
Number of connection in original graph and purified graph: * before: L = 7402 * after: L_pur = 7148
##5.2. Louvain (Fig. 8)
###Table 6.
## 1 2 3 4 5
## 5 0.988 0.889 0.297 0.422 2.061
## 4 0.533 0.735 0.293 3.971 0.739
## 3 0.470 0.602 3.199 0.214 0.355
## 2 0.637 3.810 0.589 0.650 0.871
## 1 1.738 0.658 0.409 0.439 0.948
###Germanistics An expample
##
## Administrative and specialised secretaries
## 0.02083333
## Authors; journalists and linguists
## 0.04166667
## Business services agents
## 0.08333333
## Client information workers
## 0.06250000
## Database and network professionals
## 0.10416667
## Engineering professionals (excluding electrotechnology)
## 0.04166667
## Financial and mathematical associate professionals
## 0.02083333
## General office clerks
## 0.04166667
## Information and communications technology operations and user support technicians
## 0.08333333
## Legal professionals
## 0.04166667
## Machinery mechanics and repairers
## 0.02083333
## Mining and mineral processing plant operators
## 0.02083333
## Other clerical support workers
## 0.12500000
## Other health associate professionals
## 0.02083333
## Other services managers
## 0.02083333
## Other teaching professionals
## 0.02083333
## Physical and engineering science technicians
## 0.02083333
## Professional services managers
## 0.02083333
## Regulatory government associate professionals
## 0.02083333
## Sales and purchasing agents and brokers
## 0.04166667
## Secondary education teachers
## 0.06250000
## Secretaries (general)
## 0.02083333
## Social and religious professionals
## 0.04166667
##5.3. BRIM - Barber algorithm (Fig. 9)
This algorithm has a bootstrapping step. That is why there is no two exactly same figure. But modules are the same. Seed was not set.
#6. Multi-resolution method
##6.1. Reamain L after purification (Fig. 7)
L <- rep(0, 101)
for (i in 0:100){
j <- i/10
DF.pur <- melt(A)
DF.pur <- DF.pur[DF.pur$value!=0,]
DF.pur$Var2 <- as.factor(DF.pur$Var2)
degree.tr <- data.frame(rownames(A),rowSums(A))
names(degree.tr) <- c("names", "degree.tr")
degree.oc <- data.frame(colnames(A),colSums(A))
names(degree.oc) <- c("names", "degree.oc")
DF.pur <- left_join(DF.pur, degree.tr, by=c("Var1"="names"))
DF.pur <- left_join(DF.pur, degree.oc, by=c("Var2"="names"))
DF.pur$exp <- round((DF.pur$degree.tr * DF.pur$degree.oc * j) / sum(A)) #(k_i*k_j)/(L)
DF.pur$diff <- DF.pur$value - DF.pur$exp
DF.pur$sparse <- ifelse(DF.pur$diff>=0, DF.pur$value, 0)
L[i+1] <- sum(DF.pur$sparse)
}
L.df <- data.frame(alpha=seq(0,10, by=0.1), L=L)
L.df$L <- L.df$L/7402
ggplot(L.df) + geom_point(aes(x=alpha, y=L)) +
ylab(expression(L[p]/L)) + xlab(expression(alpha)) +
scale_x_continuous(breaks=seq(0,10,by=1)) +
theme_bw()
#ggsave("Fig_Lp_vs_alpha.pdf", height=4.5, width=7)
##6.2. Multi-resolution method with Louvain: training programs
###Scanning gamma (Fig. 10)
Q.var <- rep(0, 101)
clust <- matrix(0, nrow=223, ncol=101) #223 node, 101 gamma value
L <- rep(0, 101)
connections <- rep(0, 101)
for (i in 0:100){
j <- i/10
DF.pur <- melt(A)
DF.pur <- DF.pur[DF.pur$value!=0,]
DF.pur$Var2 <- as.factor(DF.pur$Var2)
degree.tr <- data.frame(rownames(A),rowSums(A))
names(degree.tr) <- c("names", "degree.tr")
degree.oc <- data.frame(colnames(A),colSums(A))
names(degree.oc) <- c("names", "degree.oc")
DF.pur <- left_join(DF.pur, degree.tr, by=c("Var1"="names"))
DF.pur <- left_join(DF.pur, degree.oc, by=c("Var2"="names"))
DF.pur$exp <- round((DF.pur$degree.tr * DF.pur$degree.oc * j) / sum(A)) #(k_i*k_j)/(L)
DF.pur$diff <- DF.pur$value - DF.pur$exp
DF.pur$sparse <- ifelse(DF.pur$diff>=0, DF.pur$value, 0)
L[i+1] <- sum(DF.pur$sparse)
connections[i+1] <- 2054-sum(DF.pur$sparse==0)
A.pur <- acast(DF.pur, Var1 ~ Var2, value.var = "sparse")
A.pur[is.na(A.pur)] <- 0
g.pur <- graph.incidence(A.pur, directed=F, weighted = T)
lou <- cluster_louvain(g.pur)
clust[,i+1] <- lou$membership
Q.var[i+1] <- modularity(lou)
}
rownames(clust) <- lou$names
colnames(clust) <- seq(0, 10, 0.1)
###Number of clusters
C <- sapply(1:ncol(clust), function(x) max(clust[,x]))
C <- data.frame(seq(0,10,0.1), C)
colnames(C)[1] <- "gamma"
#ggplot(C, aes(x=gamma, y=C)) + geom_point() + theme_bw()
ggplot(C[1:50,], aes(x=gamma, y=C)) + geom_point(size=1.2) + theme_bw() +
annotate("segment", x = 1.8, xend = 1.2, y = 5, yend = 5, colour="blue", size=1, arrow=arrow()) +
annotate("segment", x = 4.2, xend = 3.6, y = 10, yend = 10, colour="blue", size=1, arrow=arrow()) +
labs(y="Number of clusters", x="alpha")
#ggsave("cikkbe_Fig_C_vs_gamma.pdf", height = 14, width=16, units="cm")
###Number of weights
L <- data.frame(seq(0,10,0.1), L)
colnames(L)[1] <- "gamma"
ggplot(L, aes(x=gamma, y=L)) + geom_point() + theme_bw()
ggplot(L[1:70,], aes(x=gamma, y=L)) + geom_point() + theme_bw()
###Number of links
connections <- data.frame(seq(0,10,0.1), connections)
colnames(connections)[1] <- "gamma"
ggplot(connections, aes(x=gamma, y=connections)) + geom_point() + theme_bw()
ggplot(connections[1:70,], aes(x=gamma, y=connections)) + geom_point() + theme_bw()
###Similarity of nodes
S <- matrix(0, nrow=223, ncol=223)
rownames(S) <- rownames(clust)
colnames(S) <- rownames(clust)
res1 <- 1
res2 <- 11
for (i in 1:223){
for (j in 1:223){
S[i,j] <- sum(clust[i,res1:res2]-clust[j,res1:res2]==0)
S[j,i] <- S[i,j]
}
}
S.pr <- S[1:110, 1:110]
S.pr <- S.pr/(res2-res1+1)
longData<-melt(S.pr)
longData<-longData[longData$value!=0,]
#o <- seriate(S.pr, method = "BEA_TSP")
#longData$Var1 <- factor(longData$Var1, levels=names(unlist(o[[1]][])))
#longData$Var2 <- factor(longData$Var2, levels=names(unlist(o[[1]][])))
#cl <- data.frame(names(unlist(o[[1]][])))
ord <- read.csv("ordering.csv")
#this ordering was saved for repeatability
longData$Var1 <- factor(longData$Var1, levels=ord[,2])
longData$Var2 <- factor(longData$Var2, levels=ord[,2])
cl <- data.frame(ord[,2])
colnames(cl)[1] <- "programs"
clust2 <- data.frame(rownames(clust), clust[,1])
rownames(clust2) <- 1:223
colnames(clust2) <- c("programs", "clusters")
cl <- left_join(cl, clust2, by="programs")
#color palette for axis
axis.x.colour <- (cl[,2] %>% plyr::mapvalues(from=c(1:10), to=c("#B40404", "#e68a00", "#0B6121", "#000099", "#0099cc", "#00cc99", "#6600cc", "#e600e6", "#999900", "#7094db")))
## The following `from` values were not present in `x`: 7, 8, 9, 10
#ordering <- names(unlist(o[[1]][]))
#write.csv(ordering, "ordering.csv")
ggplot(longData, aes(x = Var2, y = Var1)) +
geom_raster(aes(fill=value)) +
scale_fill_gradient(low="grey60", high="red") +
labs(x="", y="", title="") +
theme_bw() +
theme(axis.text.x=element_text(size=5, angle=90, vjust=0.2, hjust=1),
axis.text.y=element_text(size=5),
legend.text=element_text(size=8),
legend.title=element_text(size=8)) +
theme(axis.text.x=element_text(colour=axis.x.colour)) +
theme(axis.text.y=element_text(colour=axis.x.colour)) +
annotate("segment", x = 23.5, xend = 23.5, y = 0, yend = 110.5, colour = "blue", size = 0.5) +
annotate("segment", x = 42.5, xend = 42.5, y = 0, yend = 110.5, colour = "blue", size = 0.5) +
annotate("segment", x = 65.5, xend = 65.5, y = 0, yend = 110.5, colour = "blue", size = 0.5) +
annotate("segment", x = 95.5, xend = 95.5, y = 0, yend = 110.5, colour = "blue", size = 0.5) +
annotate("segment", x = 0, xend = 110.5, y = 23.5, yend = 23.5, colour = "blue", size = 0.5) +
annotate("segment", x = 0, xend = 110.5, y = 42.5, yend = 42.5, colour = "blue", size = 0.5) +
annotate("segment", x = 0, xend = 110.5, y = 65.5, yend = 65.5, colour = "blue", size = 0.5) +
annotate("segment", x = 0, xend = 110.5, y = 95.5, yend = 95.5, colour = "blue", size = 0.5)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
## Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
#ggsave("cikkbe_Fig_2.9-3.4.pdf", height = 22, width=23, units="cm")
###Programs in clusters table (Fig. 11)
table(clust[,30], clust[,1])
##
## 1 2 3 4 5 6
## 1 19 0 2 0 11 0
## 2 4 19 0 0 6 0
## 3 3 0 41 0 0 3
## 4 0 0 0 10 2 0
## 5 6 0 4 0 2 32
## 6 4 0 4 0 12 1
## 7 6 0 0 0 0 0
## 8 0 0 7 0 0 0
## 9 3 0 2 0 11 0
## 10 0 0 0 9 0 0
table(clust[1:110,30], clust[1:110,1]) #programs
##
## 1 2 3 4 5 6
## 1 12 0 1 0 6 0
## 2 1 7 0 0 5 0
## 3 1 0 22 0 0 1
## 4 0 0 0 4 1 0
## 5 2 0 2 0 1 15
## 6 3 0 3 0 6 1
## 7 3 0 0 0 0 0
## 8 0 0 4 0 0 0
## 9 1 0 0 0 4 0
## 10 0 0 0 4 0 0
table(clust[111:223,30], clust[111:223,1]) #occuptions
##
## 1 2 3 4 5 6
## 1 7 0 1 0 5 0
## 2 3 12 0 0 1 0
## 3 2 0 19 0 0 2
## 4 0 0 0 6 1 0
## 5 4 0 2 0 1 17
## 6 1 0 1 0 6 0
## 7 3 0 0 0 0 0
## 8 0 0 3 0 0 0
## 9 2 0 2 0 7 0
## 10 0 0 0 5 0 0
clu <- data.frame(
name=rownames(clust),
gamma0=clust[,1],
gamma2.9=clust[,30])
colnames(degree.oc) <- colnames(degree.tr)
degree <- rbind(degree.tr, degree.oc)
clu <- left_join(clu, degree, by=c("name"="names"))
clu15 <- clu[clu$degree.tr>15,]
ggplot(clu[clu$degree.tr>15,]) + geom_point(aes(x=gamma2.9, y=gamma0), color="red") +
geom_text_repel(aes(x=gamma2.9, y=gamma0, label=name), size=3) +
scale_x_discrete(limits=1:10) +
scale_y_discrete(limits=1:5) +
labs(x="alpha=2.9", y="alpha=0") +
theme_bw()
## Warning in scale_x_discrete(limits = 1:10): Continuous limits supplied to discrete scale.
## ℹ Did you mean `limits = factor(...)` or `scale_*_continuous()`?
## Warning in scale_y_discrete(limits = 1:5): Continuous limits supplied to discrete scale.
## ℹ Did you mean `limits = factor(...)` or `scale_*_continuous()`?
## Warning: ggrepel: 121 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#ggsave("Fig_clusters.pdf", height = 22, width=42, units="cm")
##6.3. Multi-resolution method with Louvain: occupations
###Scanning gamma
Q.var <- rep(0, 101)
clust <- matrix(0, nrow=223, ncol=101) #223 node, 101 gamma value
L <- rep(0, 101)
connections <- rep(0, 101)
for (i in 0:100){
j <- i/10
DF.pur <- melt(A)
DF.pur <- DF.pur[DF.pur$value!=0,]
DF.pur$Var2 <- as.factor(DF.pur$Var2)
degree.tr <- data.frame(rownames(A),rowSums(A))
names(degree.tr) <- c("names", "degree.tr")
degree.oc <- data.frame(colnames(A),colSums(A))
names(degree.oc) <- c("names", "degree.oc")
DF.pur <- left_join(DF.pur, degree.tr, by=c("Var1"="names"))
DF.pur <- left_join(DF.pur, degree.oc, by=c("Var2"="names"))
DF.pur$exp <- round((DF.pur$degree.tr * DF.pur$degree.oc * j) / sum(A)) #(k_i*k_j)/(L)
DF.pur$diff <- DF.pur$value - DF.pur$exp
DF.pur$sparse <- ifelse(DF.pur$diff>=0, DF.pur$value, 0)
L[i+1] <- sum(DF.pur$sparse)
connections[i+1] <- 2054-sum(DF.pur$sparse==0)
A.pur <- acast(DF.pur, Var1 ~ Var2, value.var = "sparse")
A.pur[is.na(A.pur)] <- 0
g.pur <- graph.incidence(A.pur, directed=F, weighted = T)
lou <- cluster_louvain(g.pur)
clust[,i+1] <- lou$membership
Q.var[i+1] <- modularity(lou)
}
rownames(clust) <- lou$names
colnames(clust) <- seq(0, 10, 0.1)
###Similarity of nodes
S <- matrix(0, nrow=223, ncol=223)
rownames(S) <- rownames(clust)
colnames(S) <- rownames(clust)
res1 <- 30
res2 <- 35
for (i in 1:223){
for (j in 1:223){
S[i,j] <- sum(clust[i,res1:res2]-clust[j,res1:res2]==0)
S[j,i] <- S[i,j]
}
}
S.oc <- S[111:223, 111:223]
S.oc <- S.oc/(res2-res1+1)
longData<-melt(S.oc)
longData<-longData[longData$value!=0,]
#o <- seriate(S.oc, method = "BEA_TSP")
#longData$Var1 <- factor(longData$Var1, levels=names(unlist(o[[1]][])))
#longData$Var2 <- factor(longData$Var2, levels=names(unlist(o[[1]][])))
ord <- read.csv("ordering_occup.csv")
longData$Var1 <- factor(longData$Var1, levels=ord[,2])
longData$Var2 <- factor(longData$Var2, levels=ord[,2])
cl <- data.frame(names(unlist(ord[[1]][])))
cl <- data.frame(ord[,2])
colnames(cl)[1] <- "occupations"
cl$occupations <- as.character(cl$occupations)
clust2 <- data.frame(rownames(clust), clust[,30])
rownames(clust2) <- 1:223
colnames(clust2) <- c("programs", "clusters")
cl <- left_join(cl, clust2, by=c("occupations"="programs"))
#color palette for axis
axis.x.colour <- (cl[,2] %>% plyr::mapvalues(from=c(1:10), to=c("#B40404", "#e68a00", "#0B6121", "#000099", "#0099cc", "#00cc99", "#6600cc", "#e600e6", "#999900", "#7094db")))
## The following `from` values were not present in `x`: 10
#ordering <- names(unlist(o[[1]][]))
#write.csv(ordering, "ordering_occup.csv")
ggplot(longData, aes(x = Var2, y = Var1)) +
geom_raster(aes(fill=value)) +
scale_fill_gradient(low="grey60", high="red") +
labs(x="", y="", title="") +
theme_bw() +
theme(axis.text.x=element_text(size=5, angle=90, vjust=0.2, hjust=1),
axis.text.y=element_text(size=5),
legend.text=element_text(size=8),
legend.title=element_text(size=8)) +
theme(axis.text.x=element_text(colour=axis.x.colour)) +
theme(axis.text.y=element_text(colour=axis.x.colour)) +
annotate("segment", x = 20.5, xend = 20.5, y = 0, yend = 113.5, colour = "blue", size = 0.5) +
annotate("segment", x = 42.5, xend = 42.5, y = 0, yend = 113.5, colour = "blue", size = 0.5) +
annotate("segment", x = 68.5, xend = 68.5, y = 0, yend = 113.5, colour = "blue", size = 0.5) +
annotate("segment", x = 84.5, xend = 84.5, y = 0, yend = 113.5, colour = "blue", size = 0.5) +
annotate("segment", x = 0, xend = 113.5, y = 20.5, yend = 20.5, colour = "blue", size = 0.5) +
annotate("segment", x = 0, xend = 113.5, y = 42.5, yend = 42.5, colour = "blue", size = 0.5) +
annotate("segment", x = 0, xend = 113.5, y = 68.5, yend = 68.5, colour = "blue", size = 0.5) +
annotate("segment", x = 0, xend = 113.5, y = 84.5, yend = 84.5, colour = "blue", size = 0.5)
## Warning: Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
## Vectorized input to `element_text()` is not officially supported.
## ℹ Results may be unexpected or may change in future versions of ggplot2.
#ggsave("oc_Fig_2.9-3.4_v2.pdf", height = 22, width=23, units="cm")